Importing necessary packages


In [3]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

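CountVectorizer turns each sentence into a vector of word counts (a bag-of-words representation): one row per sentence, one column per vocabulary word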


In [4]:
# Toy corpus: each string is one document to be vectorized.
corpus = [
    'This is first sentence',
    'Here is the second sentence',
    'Third sentence',
]

In [5]:
# Bag-of-words vectorizer with default settings.
count_vec = CountVectorizer()

# Learn the vocabulary from the corpus and build the sparse
# document-term count matrix in a single pass.
features = count_vec.fit_transform(raw_documents=corpus)

In [6]:
# Display the document-term count matrix with one labelled column per
# vocabulary word.
# - get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#   get_feature_names_out() is its replacement.
# - toarray() yields a plain ndarray instead of the discouraged np.matrix
#   returned by todense().
pd.DataFrame(features.toarray(), columns=count_vec.get_feature_names_out())


Out[6]:
first here is second sentence the third this
0 1 0 1 0 1 0 0 1
1 0 1 1 1 1 1 0 0
2 0 0 0 0 1 0 1 0

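TfidfVectorizer turns sentences into vectors of TF-IDF weights: each word's count is scaled by its inverse document frequency, so words that appear in many sentences get lower weight, and each row is then L2-normalised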


In [7]:
# TF-IDF vectorizer with default settings.
tfidf = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the corpus and build the
# sparse TF-IDF matrix in a single pass.
features_tfidf = tfidf.fit_transform(raw_documents=corpus)

In [8]:
# Display the TF-IDF matrix with one labelled column per vocabulary word.
# - get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#   get_feature_names_out() is its replacement.
# - toarray() yields a plain ndarray instead of the discouraged np.matrix
#   returned by todense().
pd.DataFrame(features_tfidf.toarray(), columns=tfidf.get_feature_names_out())


Out[8]:
first here is second sentence the third this
0 0.584483 0.000000 0.444514 0.000000 0.345205 0.000000 0.000000 0.584483
1 0.000000 0.504611 0.383770 0.504611 0.298032 0.504611 0.000000 0.000000
2 0.000000 0.000000 0.000000 0.000000 0.508542 0.000000 0.861037 0.000000